In [1]:
import pandas as pd
In [2]:
# Load the semicolon-delimited city table from city.csv, then display it.
df = pd.read_csv('city.csv', sep=';')
df
Out[2]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
... ... ... ... ... ...
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020

4079 rows × 5 columns

In [7]:
# Shallow summary: object columns are not inspected, so the reported memory
# figure is a lower bound (hence the trailing '+').
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           4079 non-null   int64 
 1   Name         4079 non-null   object
 2   CountryCode  4079 non-null   object
 3   District     4075 non-null   object
 4   Population   4079 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 159.5+ KB
In [8]:
# memory_usage='deep' also measures the objects held by object columns,
# so the figure reflects the real footprint rather than a lower bound.
df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           4079 non-null   int64 
 1   Name         4079 non-null   object
 2   CountryCode  4079 non-null   object
 3   District     4075 non-null   object
 4   Population   4079 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 846.9 KB
In [9]:
# Per-column byte counts (shallow): object columns are sized by their references only.
df.memory_usage()
Out[9]:
Index            128
ID             32632
Name           32632
CountryCode    32632
District       32632
Population     32632
dtype: int64
In [12]:
# Per-column byte counts including the contents of object columns.
df.memory_usage(deep=True)
Out[12]:
Index             128
ID              32632
Name           277922
CountryCode    244740
District       279127
Population      32632
dtype: int64
In [13]:
# Store CountryCode as a categorical instead of individual Python strings.
df['CountryCode'] = df['CountryCode'].astype('category')
In [14]:
# Deep memory report again, now that CountryCode is categorical.
df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   ID           4079 non-null   int64   
 1   Name         4079 non-null   object  
 2   CountryCode  4079 non-null   category
 3   District     4075 non-null   object  
 4   Population   4079 non-null   int64   
dtypes: category(1), int64(2), object(2)
memory usage: 639.4 KB
In [15]:
# Per-column deep byte counts after the conversion, for comparison with the earlier figures.
df.memory_usage(deep=True)
Out[15]:
Index             128
ID              32632
Name           277922
CountryCode     32318
District       279127
Population      32632
dtype: int64
In [16]:
# Read the same file into a second frame. CountryCode is left as read here,
# so df2 is the object-dtype counterpart of df.
df2 = pd.read_csv('city.csv', sep=';')
df2
Out[16]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
... ... ... ... ... ...
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020

4079 rows × 5 columns

In [18]:
# Deep memory report for the unconverted frame.
df2.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           4079 non-null   int64 
 1   Name         4079 non-null   object
 2   CountryCode  4079 non-null   object
 3   District     4075 non-null   object
 4   Population   4079 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 846.9 KB
In [22]:
%%timeit
# Timing of the grouping with CountryCode as an object (string) column.
df2.groupby('CountryCode').count()
1.93 ms ± 50.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [23]:
%%timeit
df.groupby('CountryCode').count()
1.44 ms ± 57.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [27]:
# Integer code stored for each row; a code is the position of the label in .cat.categories.
df.CountryCode.cat.codes
Out[27]:
0         1
1         1
2         1
3         1
4       153
       ... 
4074    171
4075    171
4076    171
4077    171
4078    171
Length: 4079, dtype: int16
In [28]:
# The distinct labels that the integer codes refer to.
df.CountryCode.cat.categories
Out[28]:
Index(['ABW', 'AFG', 'AGO', 'AIA', 'ALB', 'AND', 'ANT', 'ARE', 'ARG', 'ARM',
       ...
       'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'YUG', 'ZAF', 'ZMB', 'ZWE'],
      dtype='object', length=232)
In [ ]:
 
In [29]:
# Small sample frame: five names, each with a textual mark.
df3 = pd.DataFrame(
    [
        ('John', 'good'),
        ('Jack', 'excellent'),
        ('Katy', 'bad'),
        ('Paul', 'middle'),
        ('Susan', 'good'),
    ],
    columns=['name', 'mark'],
)
df3
Out[29]:
name mark
0 John good
1 Jack excellent
2 Katy bad
3 Paul middle
4 Susan good
In [32]:
def get_mark(v):
    """Translate a textual mark into its numeric score.

    'excellent' -> 5, 'good' -> 4, 'bad' -> 2; every other value
    (including 'middle') falls through to 3.
    """
    # Checked in the same order as the labels are listed here.
    for label, score in (('good', 4), ('excellent', 5), ('bad', 2)):
        if v == label:
            return score
    return 3
    
# Derive a numeric 'rate' column by running get_mark on each mark.
df3['rate'] = df3['mark'].apply(get_mark)
In [33]:
# Show the frame with the new 'rate' column.
df3
Out[33]:
name mark rate
0 John good 4
1 Jack excellent 5
2 Katy bad 2
3 Paul middle 3
4 Susan good 4
In [34]:
# Keep only the rows whose numeric rate is above 3.
df3.loc[df3['rate'] > 3]
Out[34]:
name mark rate
0 John good 4
1 Jack excellent 5
4 Susan good 4
In [ ]:
 
In [35]:
# Recreate the sample frame so this section starts without a 'rate' column.
df3 = pd.DataFrame(
    [
        ('John', 'good'),
        ('Jack', 'excellent'),
        ('Katy', 'bad'),
        ('Paul', 'middle'),
        ('Susan', 'good'),
    ],
    columns=['name', 'mark'],
)
df3
Out[35]:
name mark
0 John good
1 Jack excellent
2 Katy bad
3 Paul middle
4 Susan good
In [36]:
# Label -> numeric score lookup. Series.map leaves labels that are missing
# from the dict as NaN.
d_mark = dict(good=4, excellent=5, bad=2, middle=3)
df3['rate'] = df3['mark'].map(d_mark)
In [37]:
# Show the frame with the 'rate' column produced by the dict mapping.
df3
Out[37]:
name mark rate
0 John good 4
1 Jack excellent 5
2 Katy bad 2
3 Paul middle 3
4 Susan good 4
In [ ]:
 
In [38]:
# Recreate the sample frame so this section starts from plain string marks.
df3 = pd.DataFrame(
    [
        ('John', 'good'),
        ('Jack', 'excellent'),
        ('Katy', 'bad'),
        ('Paul', 'middle'),
        ('Susan', 'good'),
    ],
    columns=['name', 'mark'],
)
df3
Out[38]:
name mark
0 John good
1 Jack excellent
2 Katy bad
3 Paul middle
4 Susan good
In [39]:
# Turn 'mark' into an ordered categorical; the list runs from lowest to highest.
df3['mark'] = pd.Categorical(
    df3['mark'],
    categories=['bad', 'middle', 'good', 'excellent'],
    ordered=True,
)
In [40]:
# Confirm that 'mark' now has the category dtype.
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   name    5 non-null      object  
 1   mark    5 non-null      category
dtypes: category(1), object(1)
memory usage: 365.0+ bytes
In [41]:
# Display the column; the repr also lists the categories in their declared order.
df3.mark
Out[41]:
0         good
1    excellent
2          bad
3       middle
4         good
Name: mark, dtype: category
Categories (4, object): [bad < middle < good < excellent]
In [43]:
# Rows ranked above 'middle' in the category order, highest mark first.
(
    df3.loc[df3['mark'] > 'middle']
    .sort_values(by='mark', ascending=False)
)
Out[43]:
name mark
1 Jack excellent
0 John good
4 Susan good
In [46]:
# max() follows the declared category order, not alphabetical order.
df3.mark.max()
Out[46]:
'excellent'
In [ ]: